# Count the rows of majors_processed in each Major_category, largest group
# first. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
majors_processed %>%
  count(Major_category, sort = TRUE)
## # A tibble: 16 x 2
## Major_category n
## <chr> <int>
## 1 Engineering 29
## 2 Education 16
## 3 Humanities & Liberal Arts 15
## 4 Biology & Life Science 14
## 5 Business 13
## 6 Health 12
## 7 Computers & Mathematics 11
## 8 Agriculture & Natural Resources 10
## 9 Physical Sciences 10
## 10 Psychology & Social Work 9
## 11 Social Science 9
## 12 Arts 8
## 13 Industrial Arts & Consumer Services 7
## 14 Law & Public Policy 5
## 15 Communications & Journalism 4
## 16 Interdisciplinary 1
# Category-level summary table, sorted from the highest to the lowest share
# of women. Rows with a missing Total are dropped before aggregating.
#   Men / Women / Total : head counts summed over the category's majors
#   MedianSalary        : per-major Median averaged with Sample_size weights
#   ShareWomen          : summed Women divided by summed Total
by_major_category <- majors_processed %>%
  filter(!is.na(Total)) %>%
  group_by(Major_category) %>%
  summarize(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size),
    # Women and Total refer to the sums computed just above.
    ShareWomen = Women / Total
  ) %>%
  arrange(desc(ShareWomen))
# Histogram of the Median column across all rows of majors_processed.
# `bins = 30` is the value stat_bin() was already falling back to, so the
# plot is unchanged; stating it explicitly silences the
# "Pick better value with `binwidth`" message.
majors_processed %>%
  ggplot(aes(Median)) +
  geom_histogram(bins = 30, fill = "dodgerblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggrepel)
# Box plot of ShareWomen per major category, with categories ordered by
# their median ShareWomen and drawn horizontally.
majors_processed %>%
  # Drop rows with a missing share first, so that a single NA cannot turn a
  # category's median into NA and misplace that category in the ordering.
  filter(!is.na(ShareWomen)) %>%
  mutate(Major_category = fct_reorder(Major_category, ShareWomen)) %>%
  # The stray `label = ShareWomen` that used to be passed to ggplot() sat
  # outside aes() and had no effect, so it has been removed.
  ggplot(
    aes(
      fct_lump(Major_category, 20),
      ShareWomen,
      fill = Major_category
    )
  ) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  # NOTE(review): this title does not obviously describe the plot - confirm.
  ggtitle("Metallurgical mix") +
  theme_bw() +
  # Dashed reference line at a share of 0.153, labelled by the annotation
  # below; where the 15.3% figure comes from is not shown in this file.
  geom_hline(yintercept = 0.153, lty = 2) +
  annotate("text", label = "15.3% female", x = 9, y = 0.1, angle = 90) +
  xlab("")
# BOXPLOT COMPARING EARNINGS ACROSS MAJOR CATEGORY
# One horizontal box per major category, ordered by the category's median
# of Median, on a log10 dollar axis. The legend is suppressed because the
# fill colour only repeats the axis labels.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median, fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  xlab("") +
  scale_y_log10(labels = scales::dollar_format())
# Horizontal bar chart of the median of Median within each major category,
# with bars ordered by that value.
majors_processed %>%
  group_by(Major_category) %>%
  summarise(Median = median(Median)) %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(
    aes(
      x = Major_category,
      y = Median,
      fill = Major_category
    )
  ) +
  coord_flip() +
  geom_col(show.legend = FALSE)
# INTERQUARTILE RANGE OF HIGHEST EARNING MAJORS
# The 20 rows with the largest Median: a point at Median and an error bar
# from P25th to P75th, coloured by major category. The value axis is
# stretched to include 0.
majors_processed %>%
  arrange(desc(Median)) %>%
  select(Major, Major_category, Median, P25th, P75th) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(
    aes(
      x = Major,
      y = Median,
      colour = Major_category
    )
  ) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  expand_limits(y = 0) +
  coord_flip()
# The next plot is of particular interest to me, given my background in finance.
# Horizontal bar chart of Median for the majors whose Major_category is
# "Business", ordered by Median. The legend is hidden.
majors_processed %>%
  filter(Major_category == "Business") %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(
    aes(
      x = Major,
      y = Median,
      fill = Major
    )
  ) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Highest earning majors in business",
    x = ""
  )
library(ggrepel)
# Dot plot of the 20 majors with the smallest ShareWomen, ordered by that
# share and drawn horizontally.
majors_processed %>%
  arrange(ShareWomen) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, ShareWomen)) %>%
  ggplot(aes(x = Major, y = ShareWomen)) +
  coord_flip() +
  geom_point()
# Horizontal bar chart of the summed Total per major category, largest
# category at the top. The legend is hidden.
majors_processed %>%
  # count() with `wt` sums Total within each category into column `n`.
  count(Major_category, wt = Total, sort = TRUE) %>%
  mutate(Major_category = fct_reorder(Major_category, n)) %>%
  ggplot(aes(Major_category, n, fill = Major_category)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most common major categories",
    x = "",
    y = "Total number of graduates"
  ) +
  # The argument is `labels`; the previous `label =` only worked through
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  theme_bw() +
  theme(legend.position = "none")
# Horizontal bar chart of Total for the 20 rows with the largest Total.
# The legend is hidden.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  # Reorder the factor only after the top-20 cut, as the gender breakdown
  # plot in this file does: rows with a missing Total never reach
  # fct_reorder(), and the factor carries no unused levels.
  mutate(Major = fct_reorder(Major, Total)) %>%
  ggplot(aes(Major, Total, fill = Major)) +
  geom_col() +
  theme(legend.position = "none") +
  # The argument is `labels`; the previous `label =` only worked through
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()
# Stacked horizontal bar chart of Men and Women for the 20 rows with the
# largest Total, with majors ordered by Total.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  # Reshape the Men / Women columns into one Gender / Number pair per row.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(
    c(Men, Women),
    names_to = "Gender",
    values_to = "Number"
  ) %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  # The argument is `labels`; the previous `label =` only worked through
  # partial argument matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()
# Scatter of MedianSalary against ShareWomen, one point per row of
# by_major_category, with a linear fit and non-overlapping category labels.
# The salary axis is stretched to include 0.
by_major_category %>%
  ggplot(aes(x = ShareWomen, y = MedianSalary)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text_repel(
    aes(label = Major_category),
    force = 0.2
  ) +
  expand_limits(y = 0) +
  labs(
    title = "Relationship between median salary and proportion of women",
    subtitle = "Ordered by major category"
  )
## `geom_smooth()` using formula 'y ~ x'
library(plotly)
# Scatter of Median against ShareWomen, one point per row of
# majors_processed, converted to an interactive plotly widget.
# Colour is the major category after fct_lump(., 7), point size is
# Sample_size, and `group = 1` makes geom_smooth() draw a single linear fit
# instead of one per colour.
g <- majors_processed %>%
  mutate(Major_category = fct_lump(Major_category, 7)) %>%
  ggplot(
    aes(
      x = ShareWomen,
      y = Median,
      colour = Major_category,
      size = Sample_size
    )
  ) +
  # NOTE(review): `label` is not a geom_point() aesthetic; presumably it is
  # mapped so that ggplotly() can show the major in the tooltip - confirm.
  geom_point(aes(label = Major)) +
  geom_smooth(aes(group = 1), method = "lm") +
  expand_limits(y = 0) +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Relationship between median salary and proportion of women",
    subtitle = "Ordered by major and coloured by major category",
    x = "Percentage share of women in the field",
    y = "Median salary income"
  ) +
  #annotate("text",x=0.8,y=max(majors_processed$Median)-2000,hjust=1,vjust=1,label="Size refers to sample size")+
  #annotate("text",x=0.8,y=max(majors_processed$Median)-8000,hjust=1,vjust=1,label="Colour refers to major category")+
  theme(legend.position = "bottom")

ggplotly(g)
## `geom_smooth()` using formula 'y ~ x'
# Linear regression of Median on ShareWomen, weighting each row by its
# Sample_size, followed by the model summary.
# The model is fitted inside the pipe with `data = .` so that the printed
# "Call:" line stays as it is.
majors_processed %>%
  select(
    Major,
    Total,
    ShareWomen,
    Sample_size,
    Median
  ) %>%
  lm(
    formula = Median ~ ShareWomen,
    data = .,
    weights = Sample_size
  ) %>%
  summary()
##
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -260500 -61042 -13899 33262 865081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52073 1436 36.255 <2e-16 ***
## ShareWomen -23650 2403 -9.842 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123000 on 170 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3592
## F-statistic: 96.87 on 1 and 170 DF, p-value: < 2.2e-16